### Replication code
### Article: "Turnover: How lame-duck governments disrupt the bureaucracy and service delivery before leaving office"
### Author: Guillermo Toral (www.guillermotoral.com)
### Date: July 2023
### This file merges the datasets on elections, employment, and healthcare, and creates a codebook. It also creates a codebook for the file transitions_around_the_world.csv
### This file uses the datasets created by the code in prepare_data_elections.R, prepare_data_employment.R, and prepare_data_healthcare.R
### R version, platform, and package versions reported at the end of the file

# Prepare the environment -------------------------------------------------

### This section of the code prepares the environment 

# Clean the environment
rm(list = ls())

# Install required packages if not previously installed
package_list <- c("tidyverse", "here", "codebook") 
packages_to_install <- package_list[!(package_list %in% installed.packages()[,"Package"])]
if(length(packages_to_install)>0){
  install.packages(packages_to_install)
}

# Load required packages
library(tidyverse)
library(here)
library(codebook)

# Set Working Directory to wherever this file is located.
setwd(here())

# Load, merge and export data: All employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_all_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_all_employees.csv")

# Load, merge and export data: Healthcare employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_healthcare_employees.csv")

# Load, merge and export data: Education employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_education_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_education_employees.csv")

# Load, merge and export data: Manager employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_managers_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_managers_employees.csv")

# Load, merge and export data: Non-manager employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_nonmanagers_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_nonmanagers_employees.csv")

# Load, merge and export data: High-pay employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_highpay_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_highpay_employees.csv")

# Load, merge and export data: Low-pay employees ----------------------------------------------------------

# Employment dataset, built by "prepare_data_employment.R"
e <- read_csv("../../datasets/analysis/employment_lowpay_data.csv") %>%
  mutate(share_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         total_bureaucrats = stock_con_q15 + stock_tem_q15,
         log_bureaucrats = log(total_bureaucrats+1))
# Election dataset, built by "prepare_data_elections.R"
el <- read_csv("../../datasets/analysis/election_data.csv")
# Healthcare dataset, built by "prepare_data_healthcare.R"
hc <- read_csv("../../datasets/analysis/healthcare_data.csv")
# Healthcare personnel dataset, built by "prepare_data_elections.R" with the subset of municipal employees who are healthcare professionals
hcp <- read_csv("../../datasets/analysis/employment_healthcare_data.csv") %>%
  mutate(share_healthcare_civilservice = stock_con_q15/(stock_con_q15+stock_tem_q15),
         hcs_all = ifelse(share_healthcare_civilservice==1,1,0),
         ht_all = ifelse(share_healthcare_civilservice==0,1,0)) %>%
  dplyr::select(cod_ibge, year, share_healthcare_civilservice, hcs_all, ht_all)
# municipality covariates, downloaded from base dos dados (SQL code in the README file)
cv <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_covariates.csv") %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)),
         gdp_per_capita = gdppercapita) %>%
  dplyr::select(c(cod_ibge, year, gdp, population, gdp_per_capita, deaths))
# municipal development index, downloaded from Base dos Dados (SQL code in the README file)
undp <- read_csv("../../datasets/downloaded/other/basedosdados_municipality_development_index.csv") %>%
  filter(year==2010) %>%
  mutate(cod_ibge = as.numeric(substr(ibge,1,6)), year=2016) %>%
  dplyr::select(c(cod_ibge, year, human_development_index))
# municipality area, downloaded from IBGE
area <- read_csv("../../datasets/downloaded/other/ibge_municipality_area.csv") %>%
  mutate(cod_ibge = as.numeric(substr(CD_GCMUN,1,6)),
         area = as.numeric(AR_MUN_2019),
         area_low = ifelse(area < median(area,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, area, area_low)

# merge datasets
e <- left_join(e, el) %>%
  left_join(hc) %>%
  left_join(hcp) %>%
  left_join(cv) %>%
  left_join(undp) %>%
  left_join(area)

# Add covariates
e <- e %>%
  mutate(north = ifelse(substr(cod_ibge,1,1)==1,1,0), # region fixed effects
         northeast = ifelse(substr(cod_ibge,1,1)==2,1,0),
         southeast = ifelse(substr(cod_ibge,1,1)==3,1,0),
         south = ifelse(substr(cod_ibge,1,1)==4,1,0),
         centerwest = ifelse(substr(cod_ibge,1,1)==5,1,0),
         log_population = log(population),
         log_gdp = log(gdp),
         log_gdp_per_capita = log(gdp_per_capita),
         deaths_per_thousand = deaths / (population/1000),
         log_deaths = log(deaths),
         deaths_per_thousand = deaths/(population/1000),
         turnout = total_votes/population,
         bureaucrats_per_capita = total_bureaucrats/population) %>%
  dplyr::select(-c(gdp, deaths))

# Add indicator for low GDP per capita
gdp2004 <- cv %>%
  dplyr::filter(year==2004) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  dplyr::select(cod_ibge, year, gdp_per_capita_low)

gdp2008 <- cv %>%
  filter(year==2008) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2012 <- cv %>%
  filter(year==2012) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp2016 <- cv %>%
  filter(year==2016) %>%
  mutate(gdp_per_capita_low = ifelse(gdp_per_capita<median(gdp_per_capita,na.rm=T),1,0)) %>%
  select(cod_ibge, year, gdp_per_capita_low)

gdp_per_capita <- rbind(gdp2004, gdp2008, gdp2012, gdp2016)

e <- left_join(e, gdp_per_capita)

# Add indicator for whether municipality was exposed to CGU audit in the first 3 years of a local administration

cgu <- read_csv("../../datasets/downloaded/other/basedosdados_cgu_audits.csv") %>%
  dplyr::filter(!is.na(as.numeric(sorteio_ciclo_fef))) # remove waves for which assignment to audit was not always random (after 2015)
cgu$cod_ibge <- as.numeric(substr(cgu$id_municipio,1,6))
table(cgu$ano_evento) # year of the assignment to audit

cgu_2008 <- subset(cgu, cgu$ano_evento > 2004 & cgu$ano_evento < 2008)
cgu_2012 <- subset(cgu, cgu$ano_evento > 2008 & cgu$ano_evento < 2012)
cgu_2016 <- subset(cgu, cgu$ano_evento > 2012 & cgu$ano_evento < 2016)

e2004 <- subset(e, e$year==2004)
e2008 <- subset(e, e$year==2008)
e2012 <- subset(e, e$year==2012)
e2016 <- subset(e, e$year==2016)

# Create random indicator
e2004$cgu_audit <- 0 # no lotteries were random before 2004
e2008$cgu_audit <- ifelse(e2008$cod_ibge %in% cgu_2008$cod_ibge,1,0)
e2012$cgu_audit <- ifelse(e2012$cod_ibge %in% cgu_2012$cod_ibge,1,0)
e2016$cgu_audit <- ifelse(e2016$cod_ibge %in% cgu_2016$cod_ibge,1,0)

e <- rbind(e2004, e2008, e2012, e2016) 
table(e$cgu_audit)

write_csv(e, "../../datasets/analysis/analysis_dataset_lowpay_employees.csv")

# Create codebook for analysis datasets ---------------------------------------------------------

# Load the analysis dataset with data for all employees -- variables and codebook coincide for all other analysis datasets, regardless of the subgroup of municipal employees they use
e <- read_csv("../../datasets/analysis/analysis_dataset_all_employees.csv")

var_label(e) <- list(cod_ibge = "Municipality identifier (generated by IBGE) (NA = missing)",
  year = "Election cycle year",
  year_2004 = "Indicator for observations corresponding to the 2004 election cycle",
  year_2008 = "Indicator for observations corresponding to the 2008 election cycle",
  year_2012 = "Indicator for observations corresponding to the 2012 election cycle",
  year_2016 = "Indicator for observations corresponding to the 2016 election cycle",
  # Hires temporaries
  hires_tem_q14 = "Number of municipal employees on temporary contracts who were hired in Q14 (April - June of an election year) (NA = missing)",
  hires_tem_q15 = "Number of municipal employees on temporary contracts who were hired in Q15 (July - September of an election year) (NA = missing)",
  hires_tem_q16 = "Number of municipal employees on temporary contracts who were hired in Q16 (October - December of an election year) (NA = missing)",
  hires_tem_q1 = "Number of municipal employees on temporary contracts who were hired in Q1 (January - March after an election) (NA = missing)",
  # Dismissals temporaries
  fires_tem_q14 = "Number of municipal employees on temporary contracts who were dismissed in Q14 (April - June of an election year) (NA = missing)",
  fires_tem_q15 = "Number of municipal employees on temporary contracts who were dismissed in Q15 (July - September of an election year) (NA = missing)",
  fires_tem_q16 = "Number of municipal employees on temporary contracts who were dismissed in Q16 (October - December of an election year) (NA = missing)",
  fires_tem_q1 = "Number of municipal employees on temporary contracts who were dismissed in Q1 (January - March after an election) (NA = missing)",
  # Resignations temporaries
  resignations_tem_q14 = "Number of municipal employees on temporary contracts who resigned in Q14 (April - June of an election year) (NA = missing)",
  resignations_tem_q15 = "Number of municipal employees on temporary contracts who resigned in Q15 (July - September of an election year) (NA = missing)",
  resignations_tem_q16 = "Number of municipal employees on temporary contracts who resigned in Q16 (October - December of an election year) (NA = missing)",
  resignations_tem_q1 = "Number of municipal employees on temporary contracts who resigned in Q1 (January - March after an election) (NA = missing)",
  # Hires civil servants
  hires_con_q14 = "Number of municipal employees on civil service contracts who were hired in Q14 (April - June of an election year) (NA = missing)",
  hires_con_q15 = "Number of municipal employees on civil service contracts who were hired in Q15 (July - September of an election year) (NA = missing)",
  hires_con_q16 = "Number of municipal employees on civil service contracts who were hired in Q16 (October - December of an election year) (NA = missing)",
  hires_con_q1 = "Number of municipal employees on civil service contracts who were hired in Q1 (January - March after an election) (NA = missing)",
  # Dismissals civil servants
  fires_con_q14 = "Number of municipal employees on civil service contracts who were dismissed in Q14 (April - June of an election year) (NA = missing)",
  fires_con_q15 = "Number of municipal employees on civil service contracts who were dismissed in Q15 (July - September of an election year) (NA = missing)",
  fires_con_q16 = "Number of municipal employees on civil service contracts who were dismissed in Q16 (October - December of an election year) (NA = missing)",
  fires_con_q1 = "Number of municipal employees on civil service contracts who were dismissed in Q1 (January - March after an election) (NA = missing)",
  # Resignations civil servants
  resignations_con_q14 = "Number of municipal employees on civil service contracts who resigned in Q14 (April - June of an election year) (NA = missing)",
  resignations_con_q15 = "Number of municipal employees on civil service contracts who resigned in Q15 (July - September of an election year) (NA = missing)",
  resignations_con_q16 = "Number of municipal employees on civil service contracts who resigned in Q16 (October - December of an election year) (NA = missing)",
  resignations_con_q1 = "Number of municipal employees on civil service contracts who resigned in Q1 (January - March after an election) (NA = missing)",
  # Overall bureaucracy numbers
  stock_tem_q15 = "Total number of municipal employees on temporary contracts in Q15 (July - September of an election year) (NA = missing)",
  stock_con_q15 = "Total number of municipal employees on civil service contracts in Q15 (July - September of an election year) (NA = missing)",
  share_civilservice = "Share of municipal employees on civil service contracts in Q15 (July - September of an election year) (NA = missing)",
  total_bureaucrats = "Total number of municipal employees in Q15 (July - September of an election year) (NA = missing)",
  log_bureaucrats = "Total logged number of municipal employees in Q15 (July - September of an election year) (NA = missing)",
  bureaucrats_per_capita = "Number of municipal employees per capita (total_bureaucrats divided by population) (NA = missing)",
  # Election variables
  challenger_margin = "Margin in the vote share of the main challenger, relative to the incumbent's (NA = missing)",
  incumbent_party_pt = "Indicator for whether the incumbent mayor was elected under PT (NA = missing)",
  incumbent_party_psdb = "Indicator for whether the incumbent mayor was elected under PSDB (NA = missing)",
  incumbent_party_pmdb = "Indicator for whether the incumbent mayor was elected under MDB (NA = missing)",
  incumbent_party_large = "Indicator for whether the incumbent mayor was elected under PT, PSDB, MDB, or PP (NA = missing)",
  incumbent_party_aligned = "Indicator for whether the incumbent mayor was elected under the party controlling the presidency (NA = missing)",
  total_votes = "Total number of votes cast in the municipal election (NA = missing)",
  turnout = "Turnout in the election (total_votes divided by population) (NA = missing)",
  electoral_concentration = "Herfindahl index of electoral concentration in the election (ranges from 0 to 1, with higher values denoting more concentration of votes in less candidates) (NA = missing)",
  # Household visits by community health agents (ACS)
  acs_visits_q14 = "Number of household visits performed by community health agents in Q14 (April - June of an election year) (NA = missing)",
  acs_visits_q15 = "Number of household visits performed by community health agents in Q15 (July - September of an election year) (NA = missing)",
  acs_visits_q16 = "Number of household visits performed by community health agents in Q16 (October - December of an election year) (NA = missing)",
  acs_visits_q1 = "Number of household visits performed by community health agents in Q1 (January - March after an election) (NA = missing)",
  # Household visits by nurses
  nurse_visits_q14 = "Number of household visits performed by nurses in Q14 (April - June of an election year) (NA = missing)",
  nurse_visits_q15 = "Number of household visits performed by nurses in Q15 (July - September of an election year) (NA = missing)",
  nurse_visits_q16 = "Number of household visits performed by nurses in Q16 (October - December of an election year) (NA = missing)",
  nurse_visits_q1 = "Number of household visits performed by nurses in Q1 (January - March after an election) (NA = missing)",
  # Household visits by doctors
  doctor_visits_q14 = "Number of household visits performed by doctors in Q14 (April - June of an election year) (NA = missing)",
  doctor_visits_q15 = "Number of household visits performed by doctors in Q15 (July - September of an election year) (NA = missing)",
  doctor_visits_q16 = "Number of household visits performed by doctors in Q16 (October - December of an election year) (NA = missing)",
  doctor_visits_q1 = "Number of household visits performed by doctors in Q1 (January - March after an election) (NA = missing)",
  # Prenatal care check-ups
  prenatal_checkups_q14 = "Number of prenatal care checkups in Q14 (April - June of an election year) (NA = missing)",
  prenatal_checkups_q15 = "Number of prenatal care checkups in Q15 (July - September of an election year) (NA = missing)",
  prenatal_checkups_q16 = "Number of prenatal care checkups in Q16 (October - December of an election year) (NA = missing)",
  prenatal_checkups_q1 = "Number of prenatal care checkups in Q1 (January - March after an election) (NA = missing)",
  # Medical consultations with infants
  infant_checkups_q14 = "Number of medical consultations with infants (0-1 years old) in Q14 (April - June of an election year) (NA = missing)",
  infant_checkups_q15 = "Number of medical consultations with infants (0-1 years old) in Q15 (July - September of an election year) (NA = missing)",
  infant_checkups_q16 = "Number of medical consultations with infants (0-1 years old) in Q16 (October - December of an election year) (NA = missing)",
  infant_checkups_q1 = "Number of medical consultations with infants (0-1 years old) in Q1 (January - March after an election) (NA = missing)",
  # Medical consultations with children
  child_checkups_q14 = "Number of medical consultations with children (1-5 years old) in Q14 (April - June of an election year) (NA = missing)",
  child_checkups_q15 = "Number of medical consultations with children (1-5 years old) in Q15 (July - September of an election year) (NA = missing)",
  child_checkups_q16 = "Number of medical consultations with children (1-5 years old) in Q16 (October - December of an election year) (NA = missing)",
  child_checkups_q1 = "Number of medical consultations with children (1-5 years old) in Q1 (January - March after an election) (NA = missing)",
  # Pregnant women with their vaccines up to date
  pregnancies_vaccinated_q14 = "Number of pregnant women with their vaccines up to date in Q14 (April - June of an election year) (NA = missing)",
  pregnancies_vaccinated_q15 = "Number of pregnant women with their vaccines up to date in Q15 (July - September of an election year) (NA = missing)",
  pregnancies_vaccinated_q16 = "Number of pregnant women with their vaccines up to date in Q16 (October - December of an election year) (NA = missing)",
  pregnancies_vaccinated_q1 = "Number of pregnant women with their vaccines up to date in Q1 (January - March after an election) (NA = missing)",
  # infants with their vaccines up to date
  infants_vaccinated_q14 = "Number of infants with their vaccines up to date in Q14 (April - June of an election year) (NA = missing)",
  infants_vaccinated_q15 = "Number of infants with their vaccines up to date in Q15 (July - September of an election year) (NA = missing)",
  infants_vaccinated_q16 = "Number of infants with their vaccines up to date in Q16 (October - December of an election year) (NA = missing)",
  infants_vaccinated_q1 = "Number of infants with their vaccines up to date in Q1 (January - March after an election) (NA = missing)",
  # babies born
  births_q14 = "Number of babies born in Q14 (April - June of an election year) (NA = missing)",
  births_q15 = "Number of babies born in Q15 (July - September of an election year) (NA = missing)",
  births_q16 = "Number of babies born in Q16 (October - December of an election year) (NA = missing)",
  births_q1 = "Number of babies born in Q1 (January - March after an election) (NA = missing)",
  # babies born with low weight
  births_lowweight_q14 = "Number of underweight babies bortn (< 2,500 grams) in Q14 (April - June of an election year) (NA = missing)",
  births_lowweight_q15 = "Number of underweight babies bortn (< 2,500 grams) in Q15 (July - September of an election year) (NA = missing)",
  births_lowweight_q16 = "Number of underweight babies bortn (< 2,500 grams) in Q16 (October - December of an election year) (NA = missing)",
  births_lowweight_q1 = "Number of underweight babies bortn (< 2,500 grams) in Q1 (January - March after an election) (NA = missing)",
  # babies 0-1 year old
  babies_below1yr_q14 = "Number of babies 0-1 year old in Q14 (April - June of an election year) (NA = missing)",
  babies_below1yr_q15 = "Number of babies 0-1 year old in Q15 (July - September of an election year) (NA = missing)",
  babies_below1yr_q16 = "Number of babies 0-1 year old in Q16 (October - December of an election year) (NA = missing)",
  babies_below1yr_q1 = "Number of babies 0-1 year old in Q1 (January - March after an election) (NA = missing)",
  # change in stock of healthcare personnel
  flow_healthcare_personnel_q14 = "Change in the stock of healthcare personnel in Q14 (April - June of an election year) (NA = missing)",
  flow_healthcare_personnel_q15 = "Change in the stock of healthcare personnel in Q15 (July - September of an election year) (NA = missing)",
  flow_healthcare_personnel_q16 = "Change in the stock of healthcare personnel in Q16 (October - December of an election year) (NA = missing)",
  flow_healthcare_personnel_q1 = "Change in the stock of healthcare personnel in Q1 (January - March after an election) (NA = missing)",
  # other variables about healthcare bureaucrats
  share_healthcare_civilservice = "Share of healthcare personnel who are in civil service contracts in Q15 (July - September of an election year) (NA = missing)",
  hcs_all = "Indicator for whether all healthcare personnel is hired on civil service contracts (NA = missing)",
  ht_all = "Indicator for whether all healthcare personnel is hired on temporary contracts (NA = missing)",
  # Municipality covariates
  population = "Municipality population (NA = missing)",
  gdp_per_capita = "Municipality GDP per capita (NA = missing)",
  human_development_index = "Municipality human development index in 2010 (NA = missing)",
  area = "Municipality area",
  area_low = "Indicator for whether the municipality covers an area below the median",
  north = "Indicator for whether the municipality is in the north region",
  northeast = "Indicator for whether the municipality is in the northeast region",
  southeast = "Indicator for whether the municipality is in the southeast region",
  south = "Indicator for whether the municipality is in the south region",
  centerwest = "Indicator for whether the municipality is in the centerwest region",
  log_gdp_per_capita = "Municipality GDP per capita, logged (NA = missing)",
  log_gdp = "Municipality GDP, logged (NA = missing)",
  log_population = "Municipality population in 2015, logged (NA = missing)",
  deaths_per_thousand = "Municipality deaths per 1,000 residents (NA = missing)",
  log_deaths = "Municipality deaths, logged (NA = missing)",
  gdp_per_capita_low = "Indicator for whether the municipality GDP per capita is below the median (NA = missing)",
  cgu_audit = "Indicator for whether the municipality was exposed to a CGU audit in the first 3 years of the mayor's mandate (NA = missing)")

codebook::label_browser_static(e)

# Create codebook for transitions_around_the_world.csv --------------------

t <- read_csv("../../datasets/analysis/transitions_around_the_world.csv")

var_label(t) <- list(country = "Country",
                     year = "Year of the latest election that led to a new party
reaching national-level executive office through popular election – either direct elections in (semi-)presidential
systems, or legislative elections in parliamentary ones",
                    date_election = "Date of the election. The date corresponds to the second round when one was held",
                    date_inauguration = "Date when the election winner took office",
                    transition_length = "Number of days between date_election and date_inauguration")

codebook::label_browser_static(t)

# Notes: R version, platform, and loaded packages -------------------------

sessionInfo(package = NULL)

# R version 4.2.1 (2022-06-23)
# Platform: aarch64-apple-darwin20 (64-bit)
# Running under: macOS Monterey 12.1
# 
# Matrix products: default
# LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
# 
# locale:
#   [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] codebook_0.9.2  here_1.0.1      forcats_0.5.2   stringr_1.5.0   dplyr_1.1.2    
# [6] purrr_0.3.4     readr_2.1.2     tidyr_1.2.0     tibble_3.2.1    ggplot2_3.3.6  
# [11] tidyverse_1.3.2
# 
# loaded via a namespace (and not attached):
#   [1] pillar_1.9.0        compiler_4.2.1      cellranger_1.1.0    dbplyr_2.2.1       
# [5] tools_4.2.1         lubridate_1.8.0     jsonlite_1.8.0      googledrive_2.0.0  
# [9] lifecycle_1.0.3     gargle_1.2.0        gtable_0.3.1        pkgconfig_2.0.3    
# [13] rlang_1.1.1         reprex_2.0.2        DBI_1.1.3           cli_3.6.1          
# [17] rstudioapi_0.14     haven_2.5.1         xml2_1.3.3          withr_2.5.0        
# [21] httr_1.4.4          generics_0.1.3      vctrs_0.6.2         fs_1.5.2           
# [25] hms_1.1.2           rprojroot_2.0.3     googlesheets4_1.0.1 grid_4.2.1         
# [29] tidyselect_1.2.0    glue_1.6.2          R6_2.5.1            fansi_1.0.4        
# [33] readxl_1.4.1        tzdb_0.3.0          modelr_0.1.9        magrittr_2.0.3     
# [37] backports_1.4.1     scales_1.2.1        ellipsis_0.3.2      labelled_2.9.1     
# [41] rvest_1.0.3         assertthat_0.2.1    colorspace_2.0-3    utf8_1.2.3         
# [45] stringi_1.7.12      munsell_0.5.0       broom_1.0.1         crayon_1.5.1     